# Import custom packages & modules
import sys
sys.path.append("..")
# Univariate analysis module
from src.analyzer.univariate import *
# Multivariate analysis module
from src.analyzer.multivariate import *
# Data cleaning module
from src.datacleaner import *
# Data preprocessing module
from src.preprocessor import *
# Model evaluation module
from src.evaluator import *
# Ensemble-based methods visualization module
from src.modelizer.ensemble.tree_interpreter import *
# Model selection
from sklearn.model_selection import train_test_split
# Linear models
from sklearn.linear_model import ElasticNet, ElasticNetCV
# Non-linear models
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
# Ensemble-based methods
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
# Compute training time
import datetime
# Model evaluation wrappers
# Define project scorers (RMSE & R2) — sklearn scoring strings;
# 'neg_root_mean_squared_error' is negated by sklearn convention (scores are maximized)
third_project_scorers = ['neg_root_mean_squared_error', 'r2']
# Main training wrapper using GridSearchCV
def train_gridsearch(data, model, param_grid, metric=third_project_scorers, k=10, p=3, v=True):
    """
    Train `model` on data['train'] with an exhaustive grid search and
    evaluate it on data['test'].

    Parameters
    ----------
    data : dict mapping 'train'/'test' to [X, y] pairs
    model : estimator to tune
    param_grid : dict of hyperparameter value lists for GridSearchCV
    metric : scoring string or list of scoring strings (sklearn names)
    k : int, number of cross-validation folds
    p : int, rounding precision for reported scores
    v : bool, verbosity flag

    Returns
    -------
    dict merging the GridSearchCV/model instances, the testing-set scores
    and additional evaluation data (training time, feature count,
    learning-potential placeholder).
    """
    # Model name
    model_label = model_name(model)
    # Get training & testing data
    x_train, y_train = data['train']
    x_test, y_test = data['test']
    # Define refit condition (first metric if evaluating multiple metrics).
    # NOTE(fix): the single-metric fallback is True, not False — with
    # refit=False, gridsearch.best_estimator_ (used below) raises
    # AttributeError.
    refit_cond = metric[0] if isinstance(metric, list) else True
    # Build grid search
    gridsearch = GridSearchCV(model, param_grid, cv=k, scoring=metric, refit=refit_cond)
    # Time the model training
    start_training = datetime.datetime.now()
    # Train model with grid search
    gridsearch.fit(x_train, y_train)
    end_training = datetime.datetime.now()
    # Compute training time
    training_time = end_training - start_training
    # Format training time
    training_time_str = format_run_time(training_time)
    # Best refitted model
    trained_model = gridsearch.best_estimator_
    # Index of the best hyperparameter combination.
    # NOTE(fix): cv_results_ arrays were previously indexed with [0], which
    # reads the FIRST grid candidate — only correct for single-candidate grids.
    best_idx = gridsearch.best_index_
    # Get mean cross-validation scores for the best candidate
    cv_scores = {}
    for scorer_label in (metric if isinstance(metric, list) else [metric]):
        # NOTE(fix): with multiple scorers the results column is named after
        # each scorer; with a single scorer sklearn names it 'mean_test_score'.
        results_key = f'mean_test_{scorer_label}' if isinstance(metric, list) else 'mean_test_score'
        if scorer_label.startswith('neg'):
            # e.g. neg_root_mean_squared_error -> rmse (abs() drops the sign)
            formatted_label = "".join([w[0] for w in scorer_label.replace('neg_', '').split('_')])
            formatted_score = round(np.abs(gridsearch.cv_results_[results_key])[best_idx], p)
            cv_scores[formatted_label] = formatted_score
        else:
            cv_scores[scorer_label] = round(gridsearch.cv_results_[results_key][best_idx], p)
    # Get scores from testing set
    testing_set_scores = get_model_scores(trained_model, x_test, y_test, list(cv_scores.keys()), p, v)
    # Display cross validation mean scores
    if v:
        print_score_results(cv_scores, set_type='train')
    # Build model dictionary which contains GridSearchCV & model instances (with model name)
    model_data = {'gs': gridsearch,           # GridSearchCV trained instance
                  'model': trained_model,     # Model trained instance
                  'model_name': model_label}  # Model name
    # Build additional evaluation data dictionary
    additional_evaluation_data = {'time': training_time_str,       # Training time
                                  'n_features': x_train.shape[1],  # Selected features
                                  'learning_potential': None}      # Learning potential
    # Build results dictionary (merge dictionaries)
    results = dict(**model_data, **testing_set_scores, **additional_evaluation_data)
    return results
def select_most_important_features(features_coefs_df,
                                   n=None,
                                   method='cumsum',
                                   model=None,
                                   thr='mean',
                                   q_value=0.5,
                                   thr_value=None,
                                   v=False):
    """
    Select the most important features of a fitted model.

    Two selection methods are supported:
    - 'cumsum'    : keep the first `n` rows of `features_coefs_df`
                    (the dataframe is expected to be sorted by importance).
    - 'threshold' : keep features whose coefficient exceeds a threshold
                    derived from a quantile ('q'), the mean ('mean'),
                    or an arbitrary value (`thr_value`).

    Parameters
    ----------
    features_coefs_df : pd.DataFrame with 'feature' and 'coefficient' columns
    n : int, number of features kept by the 'cumsum' method
    method : 'cumsum' or 'threshold'
    model : fitted model, used by the 'threshold' method
    thr : 'q', 'mean', or anything else to use `thr_value`
    q_value : float, quantile used when thr == 'q'
    thr_value : float, arbitrary threshold used otherwise
    v : bool, verbosity flag forwarded to filter_features_by_threshold

    Returns
    -------
    (n_mif_data, n_mif_labels) : selected features data and their labels

    Raises
    ------
    ValueError
        If `method` is not 'cumsum' or 'threshold' (previously this fell
        through to an UnboundLocalError on return).
    """
    # NOTE(fix): `is` identity comparisons against string literals replaced
    # by `==` — string identity is a CPython interning detail and emits a
    # SyntaxWarning on Python 3.8+.
    if method == 'cumsum':
        # Keep the n most important features (dataframe sorted by importance)
        n_mif_data = features_coefs_df.iloc[:n]
        n_mif_labels = n_mif_data['feature'].tolist()
    elif method == 'threshold':
        # Derive the importance threshold
        if thr == 'q':       # quantile
            mif_thr = features_coefs_df['coefficient'].quantile(q=q_value)
        elif thr == 'mean':  # mean
            mif_thr = features_coefs_df['coefficient'].mean()
        else:                # arbitrary number
            mif_thr = thr_value
        # N.B : function from mlearn preprocessor module.
        # NOTE(review): relies on notebook-level globals X, X_train_std and
        # X_test_std — confirm they match the features being filtered.
        n_mif_data = filter_features_by_threshold(X,
                                                  X_train_std,
                                                  X_test_std,
                                                  model,
                                                  mif_thr,
                                                  verbose=v)
        n_mif_labels = n_mif_data['labels']
    else:
        raise ValueError(f"Unknown method: {method!r} (expected 'cumsum' or 'threshold')")
    # Return selected features data & labels
    return n_mif_data, n_mif_labels
def run_training_cycle(features_reduced,
                       model_param_grid,
                       model=None,
                       target='energy',
                       training_wrapper='gs',
                       k=5,
                       s=third_project_scorers,
                       p=3,
                       v=True):
    """
    Run a training cycle on a reduced feature set.

    Relies on notebook-level globals: X_train_std_df / X_test_std_df and the
    per-target y_train_* / y_test_* series defined earlier in the file.

    Parameters
    ----------
    features_reduced : list of feature labels to keep
    model_param_grid : hyperparameter grid forwarded to the training wrapper
    model : estimator, required when training_wrapper == 'gs'
    target : 'energy' or 'emissions'
    training_wrapper : 'en' (ElasticNetCV) or 'gs' (GridSearchCV)
    k, s, p, v : folds, scorers, precision, verbosity (GridSearchCV path only)

    Returns
    -------
    (results, training_and_testing_data_reduced)

    Raises
    ------
    ValueError
        For an unknown `target` or `training_wrapper` (previously these
        fell through to an UnboundLocalError).
    """
    # Filter training and testing features data (reduce the number of features)
    X_train_std_reduced = X_train_std_df.loc[:, features_reduced]
    X_test_std_reduced = X_test_std_df.loc[:, features_reduced]
    # Define target type.
    # NOTE(fix): `is 'energy'` identity checks replaced with `==` equality —
    # string identity relies on CPython interning and raises a SyntaxWarning
    # on Python 3.8+.
    if target == 'energy':
        y_train_target, y_test_target = y_train_energy, y_test_energy
    elif target == 'emissions':
        y_train_target, y_test_target = y_train_emissions, y_test_emissions
    else:
        raise ValueError(f"Unknown target: {target!r} (expected 'energy' or 'emissions')")
    # Build training & testing data dictionary for the selected target
    training_and_testing_data_reduced = {"train": [X_train_std_reduced, y_train_target],
                                         "test": [X_test_std_reduced, y_test_target]}
    # Define training wrapper type (elastic net cross val or GridSearchCV)
    if training_wrapper == 'en':
        results = train_elastic_net(training_and_testing_data_reduced,
                                    model_param_grid)
    elif training_wrapper == 'gs':
        results = train_gridsearch(training_and_testing_data_reduced,
                                   model,
                                   model_param_grid,
                                   metric=s,
                                   k=k,
                                   p=p,
                                   v=v)
    else:
        raise ValueError(f"Unknown training_wrapper: {training_wrapper!r} (expected 'en' or 'gs')")
    return results, training_and_testing_data_reduced
# Elastic Net Wrapper
def train_elastic_net(data, params_grid, k=10, v=True, s=None):
    """
    Train an ElasticNet with built-in cross validation and evaluate it on
    the testing set.

    Parameters
    ----------
    data : dict mapping 'train'/'test' to [X, y] pairs
    params_grid : dict with 'alphas' and 'l1_ratio' keys
    k : int, number of cross-validation folds
    v : bool, verbosity flag
    s : list of scorer labels (defaults to ['rmse', 'r2'])

    Returns
    -------
    dict merging the model data, testing-set scores and additional
    evaluation data (training time, feature count, best params,
    regularization path).
    """
    # NOTE(fix): mutable default argument replaced by a None sentinel
    if s is None:
        s = ['rmse', 'r2']
    # Get training and testing sets from data dictionary
    X_train_std, y_train = data["train"]
    X_test_std, y_test = data["test"]
    # Get (hyper)parameters from parameters grid.
    # NOTE(fix): previously unpacked params_grid.values(), which silently
    # depends on the caller's dict insertion order; explicit key access is
    # order-independent.
    alpha_range = params_grid["alphas"]
    l1_ratio_range = params_grid["l1_ratio"]
    # Build model using cross validation
    elastic_net = ElasticNetCV(alphas=alpha_range,
                               cv=k,
                               l1_ratio=l1_ratio_range)
    # Time the model training
    start_training = datetime.datetime.now()
    # Train model with cross validation
    elastic_net.fit(X_train_std, y_train)
    end_training = datetime.datetime.now()
    # Compute training time
    training_time = end_training - start_training
    # Format training time
    training_time_str = format_run_time(training_time)
    # Optimal hyperparameters found by cross validation
    optimal_alpha = elastic_net.alpha_
    optimal_l1_ratio = elastic_net.l1_ratio_
    if v:
        print("Alpha : {} | l1_ratio : {}\n".format(optimal_alpha,
                                                    optimal_l1_ratio))
    # Regularization path (coefficients as a function of alpha)
    alphas, coefs, _ = elastic_net.path(X_train_std,
                                        y_train,
                                        alphas=alpha_range,
                                        l1_ratio=optimal_l1_ratio)
    # Format scorer labels (example : neg_root_mean_squared_error --> rmse)
    formatted_s = []
    for scorer_label in s:
        if scorer_label.startswith('neg'):
            scorer_label = "".join([w[0] for w in scorer_label.replace('neg_', '').split('_')])
        formatted_s.append(scorer_label)
    # Model evaluation on the held-out testing set
    testing_set_scores = get_model_scores(elastic_net,
                                          X_test_std,
                                          y_test,
                                          scorer=formatted_s,
                                          verbose=v)
    # Build model dictionary which contains model data (instance & name)
    model_data = {'model': elastic_net,
                  'model_name': model_name(elastic_net)}
    # Build additional evaluation data dictionary
    additional_evaluation_data = {'time': training_time_str,          # Training time
                                  'n_features': X_train_std.shape[1], # Selected features
                                  'learning_potential': None,         # Learning potential
                                  'best_params': [optimal_alpha, optimal_l1_ratio],
                                  'reg_path_data': [alphas, coefs]}
    # Build results dictionary (merge dictionaries)
    results = dict(**model_data, **testing_set_scores, **additional_evaluation_data)
    return results
# ------------------------------------------------------------------
# Data loading, feature filtering, train/test split & standardization
# ------------------------------------------------------------------
# Import the dataset
df_raw = pd.read_csv('../data/csv/seattle_model_data_ENERGYSTARScore.csv')
df = df_raw.copy()
print(df.shape)
df.head(2)
# Targets
targets_cols = ['SiteEnergyUse(kBtu)', 'TotalGHGEmissions']
# Features (every column that is not a target)
features_cols = [col for col in df.columns if col not in targets_cols]
# Features data
df_without_targets = df[features_cols]
# N.B : functions from mlearn preprocessor module
# Method :
# null_variance_cols = features_with_null_variances(df, verbose=True)
# identical_variance_cols = features_with_identical_variances(df, col_kept='last', verbose=True)
# invalid_variances_cols = null_variance_cols + identical_variance_cols
# df = df[[col for col in df.columns if col not in invalid_variances_cols]]
# Remove invalid features from dataframe (null or duplicated variances)
df = filter_invalid_variances(df, feature_kept='last')
# N.B : functions from mlearn preprocessor module
# Get filtered features dataframe (drop highly correlated features)
features = filter_correlated_features(df_without_targets, threshold=0.5, verbose=True)
# features = df_without_targets
features.shape
# Remove energy variables
# delete_cols(features, ['Main_energy_electricity', 'Main_energy_steam'])
# Features data
X = features
# Extract features labels
training_features = X.columns.tolist()
# Targets data
y = df[targets_cols]
# Training & testing sets split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Split targets (energy consumption & gas emissions features)
y_train_energy, y_train_emissions = [y_train[target] for target in targets_cols]
y_test_energy, y_test_emissions = [y_test[target] for target in targets_cols]
# Standardize data (center & reduce)
# N.B : function from mlearn preprocessor module
X_train_std, std_scaler = standard_scaler(X_train, return_std_scaler=True)
# The scaler is fitted on the training set only, then applied to the
# testing set — avoids data leakage
X_test_std = std_scaler.transform(X_test)
# Build training & testing data dictionary for each target
energy_data = {"train": [X_train_std, y_train_energy],
               "test": [X_test_std, y_test_energy]}
emissions_data = {"train": [X_train_std, y_train_emissions],
                  "test": [X_test_std, y_test_emissions]}
# Build dataframes from standardized training & testing data
# => makes it easier to filter the features that contribute most to the model
X_train_std_df = pd.DataFrame(X_train_std, columns=training_features)
X_test_std_df = pd.DataFrame(X_test_std, columns=training_features)
# ------------------------------------------------------------------
# Dummy baselines — mean-strategy regressors give the reference
# RMSE/R2 that the real models must beat.
# ------------------------------------------------------------------
# Build dummy regression with mean as strategy (energy target)
y_pred_dum_energy, y_test_dum_energy = dummy_regression(X_train_std,
                                                        y_train_energy,
                                                        X_test_std,
                                                        y_test_energy,
                                                        strategy='mean')
print('Dummy regression : RMSE = {}'.format(mean_squared_error(y_test_energy,
                                                               y_pred_dum_energy,
                                                               squared=False)))
print('Dummy regression : R2 = {}'.format(r2_score(y_test_energy, y_pred_dum_energy)))
# Build dummy regression with mean as strategy (emissions target)
y_pred_dum_emissions, y_test_dum_emissions = dummy_regression(X_train_std,
                                                              y_train_emissions,
                                                              X_test_std,
                                                              y_test_emissions,
                                                              strategy='mean')
print('Dummy regression : RMSE = {}'.format(mean_squared_error(y_test_emissions,
                                                               y_pred_dum_emissions,
                                                               squared=False)))
print('Dummy regression : R2 = {}'.format(r2_score(y_test_emissions, y_pred_dum_emissions)))
# ------------------------------------------------------------------
# Elastic Net — energy target
# ------------------------------------------------------------------
# alphas range : np.logspace(-3, 3, 10), np.arange(0.001, 0.009, 0.001)
en_params_grid = {"alphas": np.logspace(-3, 3, 10),
                  "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}
# Train Elastic Net with cross validation (result is a dictionary)
enet_en_data = train_elastic_net(energy_data, en_params_grid, k=10)
# Get hyperparameter & coefficient ranges
en_alphas, en_coefs = enet_en_data["reg_path_data"]
# N.B : function from mlearn evaluator module
plot_regularization_path(en_alphas, en_coefs.T, training_features, n_features_labels=10)
# N.B : Elastic Net reduces the coefficients of irrelevant features to 0.
# We have therefore selected here features with positive or negative coefficients
elastic_net_en_coefs = enet_en_data["model"].coef_
# Extract feature importance filtered coefficients dataframes
elastic_net_en_features_coefs_df = get_features_importance(training_features,
                                                           elastic_net_en_coefs,
                                                           abs_coefs=True,
                                                           non_zero_coefs=True,
                                                           verbose=True)
# N.B : function from mlearn preprocessor module
plot_cumulative_features_importance(elastic_net_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Select the 43 most important features (cumulative importance order)
enet_en_mif_43_data = elastic_net_en_features_coefs_df.iloc[:43]
enet_en_mif_43_labels = enet_en_mif_43_data['feature'].tolist()
# Visualize n most important features
# N.B : function from mlearn preprocessor module
plot_n_top_features(enet_en_mif_43_data,
                    model_name(enet_en_data["model"]),
                    n=10,
                    plot_size=(12, 4))
# Second training cycle with features reduced (173 -> 43)
X_train_std_en_43f = X_train_std_df.loc[:, enet_en_mif_43_labels]
X_test_std_en_43f = X_test_std_df.loc[:, enet_en_mif_43_labels]
# Build training & testing data dictionary for each target
en_data_reduced_43f = {"train": [X_train_std_en_43f, y_train_energy],
                       "test": [X_test_std_en_43f, y_test_energy]}
enet_en_reduced_data_43f = train_elastic_net(en_data_reduced_43f, en_params_grid)
# Alternative threshold-based selection (Q3 of the coefficient distribution)
# We use the same model (from first training cycle)
mif_q3 = elastic_net_en_features_coefs_df['coefficient'].quantile(q=0.75)
ffd = filter_features_by_threshold(X, X_train_std, X_test_std, enet_en_data['model'], mif_q3)
print('Features selected : {}'.format(len(ffd['labels'])))
enet_en_mif_29_labels = ffd['labels']
# Second training cycle with features reduced (173 -> 29)
X_train_std_en_29f = X_train_std_df.loc[:, enet_en_mif_29_labels]
X_test_std_en_29f = X_test_std_df.loc[:, enet_en_mif_29_labels]
# Build training & testing data dictionary for each target
en_data_reduced_29f = {"train": [X_train_std_en_29f, y_train_energy],
                       "test": [X_test_std_en_29f, y_test_energy]}
enet_en_reduced_data_29f = train_elastic_net(en_data_reduced_29f, en_params_grid)
# Cumulative feature importance selection method seems to be slightly better
# (fewer variables selected and a slightly better r2)
# Plot training curve
# N.B : function from mlearn evaluator module
plot_validation_curve(ElasticNet(alpha=0.001, l1_ratio=0.2),
                      X_train_std_en_29f,
                      y_train_energy,
                      'alpha',
                      np.logspace(-3, 3, 10),
                      log_scale=True,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
# N.B : function from mlearn evaluator module
plot_learning_curve(ElasticNet(alpha=0.001, l1_ratio=0.2, max_iter=2000),
                    X_train_std_en_29f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : No
# Model performance seems to stagnate from around 2 000 observations
enet_en_reduced_data_29f['learning_potential'] = 'No'
# N.B : From here on, we will use the wrapper functions defined above in order to :
# ------------------------------------------------------------------
# Elastic Net — emissions target
# ------------------------------------------------------------------
# Train model
em_params_grid = {"alphas": np.logspace(-3, 3, 10),
                  "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}
enet_em_data = train_elastic_net(emissions_data, em_params_grid, k=10)
# Regularization path data
em_alphas, em_coefs = enet_em_data["reg_path_data"]
plot_regularization_path(em_alphas, em_coefs.T, training_features, n_features_labels=10)
elastic_net_em_coefs = enet_em_data["model"].coef_
elastic_net_em_features_coefs_df = get_features_importance(training_features,
                                                           elastic_net_em_coefs,
                                                           abs_coefs=True,
                                                           non_zero_coefs=True,
                                                           verbose=True)
plot_cumulative_features_importance(elastic_net_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Keep the 51 most important features (cumulative importance)
enet_em_mif_51_data, enet_em_mif_51_labels = select_most_important_features(elastic_net_em_features_coefs_df,
                                                                            n=51,
                                                                            method='cumsum')
# Visualize n most important features
plot_n_top_features(enet_em_mif_51_data,
                    model_name(enet_em_data["model"]),
                    n=10,
                    plot_size=(12, 4))
# Second training cycle with features reduced (173 -> 69)
# em_params_grid = {"alphas": np.arange(0.0020, 0.0030, 0.0001),
#                   "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}
enet_em_reduced_data_51f, train_and_test_data_reduced_51f = run_training_cycle(enet_em_mif_51_labels,
                                                                               em_params_grid,
                                                                               training_wrapper='en')
# Threshold-based selection (Q3 of the coefficient distribution)
enet_em_mif_n_data, enet_em_mif_n_labels = select_most_important_features(elastic_net_em_features_coefs_df,
                                                                          method='threshold',
                                                                          model=enet_em_data["model"],
                                                                          thr='q',
                                                                          q_value=0.75, # Q3
                                                                          v=True)
# # Visualize n most important features
# plot_n_top_features(mif_n_data['data'],
#                     model_name(enet_emissions_data["model"]),
#                     n=10,
#                     plot_size=(12, 4))
# Second training cycle with features reduced (173 -> 28)
enet_em_reduced_data_28f, train_and_test_data_reduced_28f = run_training_cycle(enet_em_mif_n_labels,
                                                                               em_params_grid,
                                                                               training_wrapper='en')
# Plot training curve (emissions target, 28 selected features)
x_train_std_em_28f = train_and_test_data_reduced_28f['train'][0]
plot_validation_curve(ElasticNet(alpha=0.001, l1_ratio=0.2),
                      x_train_std_em_28f,
                      y_train_emissions,
                      'alpha',
                      np.logspace(-3, 3, 10),
                      log_scale=True,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(ElasticNet(alpha=0.001, l1_ratio=0.2, max_iter=2000),
                    x_train_std_em_28f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : No
# The curve stagnates around 2500 observations
enet_em_reduced_data_28f['learning_potential'] = 'No'
%%time
# NOTE(review): '%%time' is an IPython cell magic — this file is a notebook
# export and is not runnable as a plain Python script.
# ------------------------------------------------------------------
# Random Forest — energy target
# ------------------------------------------------------------------
# Random Forest
rfr_model = RandomForestRegressor(random_state=42)
# n_estimators range | max_depth range
# np.arange(200, 550, 50) | [2, 4, 6, 8, 10]
# [400, 500, 600]
# Optimal hyperparameters
rfr_en_params = {'n_estimators': [100], # 100
                 # 'max_depth': [40],
                 # 'max_features': ['auto', 'sqrt', 'log2']
                 'min_samples_leaf': [1]}
rfr_en_data = train_gridsearch(energy_data,
                               rfr_model,
                               rfr_en_params,
                               k=10)
# print(rfr_en_data['model'])
"""
- RMSE = 0.333
- R2 = 0.915
"""
rfr_en_model = rfr_en_data['model']
# Feature importances from the fitted forest
rfr_en_coefs = rfr_en_model.feature_importances_
rfr_en_features_coefs_df = get_features_importance(training_features,
                                                   rfr_en_coefs,
                                                   abs_coefs=True,
                                                   non_zero_coefs=False,
                                                   verbose=False)
plot_cumulative_features_importance(rfr_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Keep the 7 most important features (cumulative importance)
rfr_en_mif_7_data, rfr_en_mif_7_labels = select_most_important_features(rfr_en_features_coefs_df,
                                                                        n=7,
                                                                        method='cumsum')
# Visualize n most important features
plot_n_top_features(rfr_en_mif_7_data,
                    model_name(rfr_en_model),
                    n=7,
                    plot_size=(12, 4))
# Second training cycle with features reduced (173 -> 7)
rfr_en_model_c2 = RandomForestRegressor(random_state=42)
rfr_en_params = {'n_estimators': [100],
                 # 'max_depth': [40],
                 # 'max_features': ['auto', 'sqrt', 'log2']
                 'min_samples_leaf': [1]}
rfr_en_reduced_data_7f, train_and_test_data_reduced_7f = run_training_cycle(rfr_en_mif_7_labels,
                                                                            rfr_en_params,
                                                                            rfr_en_model_c2)
# Display naive decision tree (shallow tree from a small forest, for inspection)
rfr_en_naiv_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=3)
rfr_en_naiv_model.fit(*train_and_test_data_reduced_7f['train'])
display_decision_tree(rfr_en_naiv_model,
                      rfr_en_mif_7_labels,
                      targets_cols[0],
                      tree_nb=5)
# Threshold-based selection (Q3 of the coefficient distribution)
rfr_en_mif_n_data, rfr_en_mif_n_labels = select_most_important_features(rfr_en_features_coefs_df,
                                                                        method='threshold',
                                                                        model=rfr_en_model,
                                                                        thr='q',
                                                                        q_value=0.75, # Q3
                                                                        v=True)
# Second training cycle with features reduced (173 -> 32)
rfr_en_model_c2 = RandomForestRegressor(random_state=42)
rfr_en_reduced_data_32f, train_and_test_data_reduced_32f = run_training_cycle(rfr_en_mif_n_labels,
                                                                              rfr_en_params,
                                                                              rfr_en_model_c2)
# Display naive decision tree
rfr_en_naiv_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=3)
rfr_en_naiv_model.fit(*train_and_test_data_reduced_32f['train'])
display_decision_tree(rfr_en_naiv_model,
                      rfr_en_mif_n_labels,
                      targets_cols[0],
                      tree_nb=5)
# Plot training curve (energy target, 7 selected features)
x_train_std_en_7f = train_and_test_data_reduced_7f['train'][0]
plot_validation_curve(RandomForestRegressor(n_estimators=100, random_state=42),
                      x_train_std_en_7f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 500, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(RandomForestRegressor(n_estimators=100, min_samples_leaf=1, random_state=42),
                    x_train_std_en_7f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
rfr_en_reduced_data_7f['learning_potential'] = 'Yes'
%%time
# ------------------------------------------------------------------
# Random Forest — emissions target
# ------------------------------------------------------------------
# Random Forest
rfr_model = RandomForestRegressor(random_state=42) #,n_jobs=-1)
# n_estimators range | max_depth range
# np.arange(200, 550, 50) | [2, 4, 6, 8, 10]
# [400, 500, 600]
rfr_em_params = {'n_estimators': [200], # 350 3min
                 # 'max_depth': [40],
                 # 'max_features': ['auto', 'sqrt', 'log2']
                 'min_samples_leaf': [1]}
rfr_em_data = train_gridsearch(emissions_data,
                               rfr_model,
                               rfr_em_params,
                               k=10)
print(rfr_em_data['model'])
rfr_em_model = rfr_em_data['model']
# Feature importances from the fitted forest
rfr_em_coefs = rfr_em_model.feature_importances_
rfr_em_features_coefs_df = get_features_importance(training_features,
                                                   rfr_em_coefs,
                                                   abs_coefs=True,
                                                   non_zero_coefs=False,
                                                   verbose=False)
plot_cumulative_features_importance(rfr_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Keep the 8 most important features (cumulative importance)
rfr_em_mif_8_data, rfr_em_mif_8_labels = select_most_important_features(rfr_em_features_coefs_df,
                                                                        n=8,
                                                                        method='cumsum')
# Visualize n most important features
plot_n_top_features(rfr_em_mif_8_data,
                    model_name(rfr_em_model),
                    n=8,
                    plot_size=(12, 4))
# Second training cycle with features reduced (173 -> 8)
rfr_em_model_c2 = RandomForestRegressor(random_state=42, n_jobs=-1)
rfr_em_reduced_data_8f, train_and_test_data_reduced_8f = run_training_cycle(rfr_em_mif_8_labels,
                                                                            rfr_em_params,
                                                                            rfr_em_model_c2)
# Display naive decision tree (shallow tree from a small forest, for inspection)
rfr_em_naiv_model = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=3)
rfr_em_naiv_model.fit(*train_and_test_data_reduced_8f['train'])
display_decision_tree(rfr_em_naiv_model,
                      rfr_em_mif_8_labels,
                      targets_cols[1],
                      tree_nb=5)
# Threshold-based selection (Q3 of the coefficient distribution)
rfr_em_mif_n_data, rfr_em_mif_n_labels = select_most_important_features(rfr_em_features_coefs_df,
                                                                        method='threshold',
                                                                        model=rfr_em_model,
                                                                        thr='q',
                                                                        q_value=0.75, # Q3
                                                                        v=True)
# Second training cycle with features reduced (173 -> 32)
rfr_em_model_c2 = RandomForestRegressor(random_state=42)
rfr_em_reduced_data_32f, train_and_test_data_reduced_32f = run_training_cycle(rfr_em_mif_n_labels,
                                                                              rfr_em_params,
                                                                              rfr_em_model_c2)
# Display naive decision tree
rfr_em_naiv_model = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=3)
rfr_em_naiv_model.fit(*train_and_test_data_reduced_32f['train'])
display_decision_tree(rfr_em_naiv_model,
                      rfr_em_mif_n_labels,
                      targets_cols[1],
                      tree_nb=5)
# Plot training curve (emissions target, 8 selected features)
x_train_std_em_8f = train_and_test_data_reduced_8f['train'][0]
# NOTE(fix): both curves below previously used y_train_energy — a copy-paste
# slip from the energy section; x_train_std_em_8f is built from the EMISSIONS
# split, so the matching target is y_train_emissions.
plot_validation_curve(RandomForestRegressor(n_estimators=200, random_state=42),
                      x_train_std_em_8f,
                      y_train_emissions,
                      'n_estimators',
                      np.arange(200, 500, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(RandomForestRegressor(n_estimators=200, random_state=42),
                    x_train_std_em_8f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
rfr_em_reduced_data_8f['learning_potential'] = 'Yes'
%%time
# ------------------------------------------------------------------
# XGBoost — energy target
# ------------------------------------------------------------------
# XGBoost
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')
# xgb_reg_params = {'n_estimators': [150, 200, 250],
#                   'alpha': [0.5, 2, 5],
#                   'learning_rate': np.arange(0.1, 1.1, 0.1),
#                   'max_depth': [5, 10, 20],
#                   'colsample_bytree': [0.3],
#                   #'subsample': np.arange(0.1, 1.1, 0.1)
#                   }
xgb_reg_en_params = {'n_estimators': [500],
                     'alpha': [0.5], # selected from np.arange(1, 2, 0.1),
                     'learning_rate': [0.062], # 0.065
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }
xgb_en_data = train_gridsearch(energy_data,
                               xgb_reg,
                               xgb_reg_en_params)
"""
- RMSE = 0.276
- R2 = 0.942
"""
print(xgb_en_data['model'])
xgb_reg_en_model = xgb_en_data['model']
# Feature importances from the fitted booster
xgb_reg_en_coefs = xgb_reg_en_model.feature_importances_
xgb_reg_en_features_coefs_df = get_features_importance(training_features,
                                                       xgb_reg_en_coefs,
                                                       abs_coefs=True,
                                                       non_zero_coefs=False,
                                                       verbose=False)
plot_cumulative_features_importance(xgb_reg_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Keep the 29 most important features (cumulative importance)
xgb_reg_en_mif_29_data, xgb_reg_en_mif_29_labels = select_most_important_features(xgb_reg_en_features_coefs_df,
                                                                                  n=29,
                                                                                  method='cumsum')
# Visualize n most important features
plot_n_top_features(xgb_reg_en_mif_29_data,
                    model_name(xgb_reg_en_model),
                    n=10,
                    plot_size=(12, 4))
%%time
# Second training cycle with features reduced (173 -> 29)
xgb_reg_en_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_en_reduced_data_29f, train_and_test_data_reduced_29f = run_training_cycle(xgb_reg_en_mif_29_labels,
                                                                                  xgb_reg_en_params,
                                                                                  xgb_reg_en_model_c2)
# Display naive decision tree
xgb.plot_tree(xgb_reg_en_reduced_data_29f['model'], num_trees=5)
plt.show()
# Threshold-based selection (Q3 of the coefficient distribution)
xgb_reg_en_mif_n_data, xgb_reg_en_mif_n_labels = select_most_important_features(xgb_reg_en_features_coefs_df,
                                                                                method='threshold',
                                                                                model=xgb_reg_en_model,
                                                                                thr='q',
                                                                                q_value=0.75, # Q3
                                                                                v=True)
# Second training cycle with features reduced (173 -> 32)
xgb_reg_en_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_en_reduced_data_32f, train_and_test_data_reduced_32f = run_training_cycle(xgb_reg_en_mif_n_labels,
                                                                                  xgb_reg_en_params,
                                                                                  xgb_reg_en_model_c2)
# Display naive decision tree
xgb.plot_tree(xgb_reg_en_reduced_data_32f['model'], num_trees=5)
plt.show()
# Plot training curve (energy target, 29 selected features)
x_train_std_en_29f = train_and_test_data_reduced_29f['train'][0]
# NOTE: xgb_reg_en_params is rebound here from a grid (value lists) to plain
# scalar kwargs so it can be splatted directly into the constructor below
xgb_reg_en_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 0.5,
                     'learning_rate': 0.062,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}
plot_validation_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                      x_train_std_en_29f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 800, 100),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                    x_train_std_en_29f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
xgb_reg_en_reduced_data_29f['learning_potential'] = 'Yes'
%%time
# ------------------------------------------------------------------
# XGBoost — emissions target
# ------------------------------------------------------------------
# XGBoost
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')
# xgb_reg_params = {'n_estimators': [150, 200, 250, 300],
#                   'alpha': [2, 5],
#                   'learning_rate': np.arange(0.1, 1.1, 0.1),
#                   'max_depth': [5, 10, 20],
#                   'colsample_bytree': [0.3],
#                   'subsample': np.arange(0.1, 1.1, 0.1)}
xgb_reg_em_params = {'n_estimators': [500],
                     'alpha': [1.6], # selected from np.arange(1, 2, 0.1),
                     'learning_rate': [0.05],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }
xgb_em_data = train_gridsearch(emissions_data,
                               xgb_reg,
                               xgb_reg_em_params)
print(xgb_em_data['model'])
xgb_reg_em_model = xgb_em_data['model']
# Feature importances from the fitted booster
xgb_reg_em_coefs = xgb_reg_em_model.feature_importances_
xgb_reg_em_features_coefs_df = get_features_importance(training_features,
                                                       xgb_reg_em_coefs,
                                                       abs_coefs=True,
                                                       non_zero_coefs=False,
                                                       verbose=False)
plot_cumulative_features_importance(xgb_reg_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Keep the 35 most important features (cumulative importance)
xgb_reg_em_mif_35_data, xgb_reg_em_mif_35_labels = select_most_important_features(xgb_reg_em_features_coefs_df,
                                                                                  n=35,
                                                                                  method='cumsum')
# Visualize n most important features
plot_n_top_features(xgb_reg_em_mif_35_data,
                    model_name(xgb_reg_em_model),
                    n=10,
                    plot_size=(12, 4))
%%time
# Second training cycle with features reduced (173 -> 35)
xgb_reg_em_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_em_reduced_data_35f, train_and_test_data_reduced_35f = run_training_cycle(xgb_reg_em_mif_35_labels,
                                                                                  xgb_reg_em_params,
                                                                                  xgb_reg_em_model_c2)
# Display naive decision tree
xgb.plot_tree(xgb_reg_em_reduced_data_35f['model'], num_trees=5)
plt.show()
# Threshold-based selection (Q3 of the coefficient distribution)
xgb_reg_em_mif_n_data, xgb_reg_em_mif_n_labels = select_most_important_features(xgb_reg_em_features_coefs_df,
                                                                                method='threshold',
                                                                                model=xgb_reg_em_model,
                                                                                thr='q',
                                                                                q_value=0.75, # Q3
                                                                                v=True)
# Second training cycle with features reduced (173 -> 32)
xgb_reg_em_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_em_reduced_data_32f, train_and_test_data_reduced_32f = run_training_cycle(xgb_reg_em_mif_n_labels,
                                                                                  xgb_reg_em_params,
                                                                                  xgb_reg_em_model_c2)
# Display naive decision tree
xgb.plot_tree(xgb_reg_em_reduced_data_32f['model'], num_trees=5)
plt.show()
# Plot training curve (emissions target, 32 selected features)
x_train_std_em_32f = train_and_test_data_reduced_32f['train'][0]
# NOTE: xgb_reg_em_params is rebound here from a grid (value lists) to plain
# scalar kwargs so it can be splatted directly into the constructor below
xgb_reg_em_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 1.6,
                     'learning_rate': 0.05,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}
plot_validation_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                      x_train_std_em_32f,
                      y_train_emissions,
                      'n_estimators',
                      np.arange(200, 800, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                    x_train_std_em_32f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
xgb_reg_em_reduced_data_32f['learning_potential'] = 'Yes'
# Union of the best features selected by each model, for each target.
# Energy target: concatenate the per-model selections, then deduplicate.
en_total_best_features = [*enet_en_mif_29_labels, *rfr_en_mif_7_labels, *xgb_reg_en_mif_n_labels]
en_best_features = list(set(en_total_best_features))
# Emissions target: same concatenate-then-deduplicate scheme.
em_total_best_features = [*enet_em_mif_n_labels, *rfr_em_mif_8_labels, *xgb_reg_em_mif_n_labels]
em_best_features = list(set(em_total_best_features))
# Report how many unique features survive per target
print('Energy target has {} best features'.format(len(en_best_features)))
print('Emissions target has {} best features'.format(len(em_best_features)))
# Nonlinear models do not allow measuring feature contributions in a simple way.
# We will therefore proceed differently here by:
%%time
# ------------------------------------------------------------------
# Kernel SVR (RBF) — energy target, first training cycle on all features
# ------------------------------------------------------------------
kernel_svr_model = SVR(kernel='rbf')
kernel_svr_en_params = {'C': [50], # 250 # np.arange(50, 550, 50), np.logspace(-3, 3, 10)
                        'gamma': [0.001],
                        'epsilon': [0.1]}
kernel_svr_en_data = train_gridsearch(energy_data,
                                      kernel_svr_model,
                                      kernel_svr_en_params)
kernel_svr_en_model = kernel_svr_en_data['model']
kernel_svr_en_model
%%time
# Second training cycle with features reduced
kernel_svr_en_model = SVR(kernel='rbf')
# We test the different sets of selected features for each model by also adding the union of these
en_selected_feature_sets_dict = {'ElasticNet': enet_en_mif_29_labels,
                                 'RandomForestRegressor': rfr_en_mif_7_labels,
                                 'XGBoostRegressor': xgb_reg_en_mif_n_labels,
                                 'Union of features selected by each model': en_best_features}
def run_training_cycle_for_each_feature_set(feature_sets, model, model_param_grid, cycle_runner=None):
    """Run one training cycle per candidate feature set and collect the scores.

    Parameters
    ----------
    feature_sets : dict
        Maps a descriptive label (the model that selected the features) to the
        list of selected feature names.
    model : estimator
        The (unfitted) model retrained on each reduced feature set.
    model_param_grid : dict
        Hyperparameter grid forwarded to the training wrapper.
    cycle_runner : callable, optional
        Function with the signature of ``run_training_cycle`` returning
        ``(model_data_dict, train_and_test_data)``. Defaults to the
        module-level ``run_training_cycle``; exposed for testing.

    Returns
    -------
    pd.DataFrame
        One row per feature set with its rmse, r2, run time, feature count
        and the label of the model that selected the features.
    """
    # Default to the module-level training wrapper (kept lazy so the module
    # can be imported without it being defined yet)
    if cycle_runner is None:
        cycle_runner = run_training_cycle
    # Second cycle results dictionary (column -> list of row values)
    second_cycle_results = {'rmse': [],
                            'r2': [],
                            'time': [],
                            'total selected features': [],
                            'selected features by': []}
    # Run second training cycle for each selected features list
    for model_of_feature_set, selected_features in feature_sets.items():
        model_reduced_data_nf, _ = cycle_runner(selected_features,
                                                model_param_grid,
                                                model,
                                                v=False)
        second_cycle_results['rmse'].append(model_reduced_data_nf['rmse'])
        second_cycle_results['r2'].append(model_reduced_data_nf['r2'])
        second_cycle_results['time'].append(model_reduced_data_nf['time'])
        second_cycle_results['total selected features'].append(len(selected_features))
        second_cycle_results['selected features by'].append(model_of_feature_set)
    # Build second cycle results dataframe
    return pd.DataFrame(second_cycle_results)
# Run the second cycle for every candidate energy feature set
kernel_svr_en_second_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                                kernel_svr_en_model,
                                                                                kernel_svr_en_params)
# Best performances for each feature set (notebook cell output)
kernel_svr_en_second_cycle_results_df
# Corresponds to 10 variables from original dataset
enet_en_mif_29_labels
# Corresponds to 7 variables from original dataset
rfr_en_mif_7_labels
# Less than 10 % per variable from difference (10 - 7) between ElasticNet and Random Forest feature selections
percentage_change(0.466, 0.346)
# Adding 3 variables compared to Random Forest total feature selection (7)
# increases the total amount of variables required by approximately 43%
percentage_change(7, 10)
# Train best Kernel SVR model with appropriate selected features list
# Best model is selected by arbitrating between feature total and score (RMSE)
# Fewer features mean less data acquisition costs
# we select Random Forest feature selection
kernel_svr_en_reduced_data_7f, train_and_test_data_reduced_7f = run_training_cycle(rfr_en_mif_7_labels,
                                                                                   kernel_svr_en_params,
                                                                                   kernel_svr_en_model,
                                                                                   v=False)
# Standardized training features restricted to the 7 selected columns
x_train_std_en_7f = train_and_test_data_reduced_7f['train'][0]
# Rebind params as plain kwargs (not a grid) for direct SVR construction below
kernel_svr_en_params = {'kernel': 'rbf',
                        'C': 50,
                        'gamma': 0.001,
                        'epsilon': 0.1}
# Validation curve over C to check sensitivity around the retained value
plot_validation_curve(SVR(**kernel_svr_en_params),
                      x_train_std_en_7f,
                      y_train_energy,
                      'C',
                      np.arange(1, 100, 10),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
# N.B : function from mlearn evaluator module
plot_learning_curve(SVR(**kernel_svr_en_params),
                    x_train_std_en_7f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : No
kernel_svr_en_reduced_data_7f['learning_potential'] = 'No'
%%time
# First training cycle : Kernel SVR (RBF) on the full emissions feature set
kernel_svr_model = SVR(kernel='rbf')
kernel_svr_em_params = {'C': [50], # 300
                        'gamma': [0.001], # 0.01
                        'epsilon': [0.1]}
kernel_svr_em_data = train_gridsearch(emissions_data,
                                      kernel_svr_model,
                                      kernel_svr_em_params)
# Display the fitted best estimator (notebook cell output)
kernel_svr_em_data['model']
%%time
# Second training cycle with features reduced
kernel_svr_em_model = SVR(kernel='rbf')
# We test the different sets of selected features for each model by also adding the union of these
em_selected_feature_sets_dict = {'ElasticNet': enet_em_mif_n_labels,
                                 'RandomForestRegressor': rfr_em_mif_8_labels,
                                 'XGBoostRegressor': xgb_reg_em_mif_n_labels,
                                 'Union of features selected by each model': em_best_features}
kernel_svr_em_second_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                                kernel_svr_em_model,
                                                                                kernel_svr_em_params)
# Best performances for each feature set (notebook cell output)
kernel_svr_em_second_cycle_results_df
# Corresponds to 11 variables from original dataset
enet_em_mif_n_labels
# Corresponds to 8 variables from original dataset
rfr_em_mif_8_labels
# Less than 8 % per variable from difference (11 - 8) between ElasticNet and Random Forest feature selections
percentage_change(0.456, 0.348)
# Adding 3 variables compared to Random Forest total feature selection (8)
# increases the total amount of variables required by approximately 38%
percentage_change(8, 11)
# Train best Kernel SVR model with appropriate selected features list
# Best model is selected by arbitrating between feature total and score (RMSE)
# Fewer features mean less data acquisition costs
# we select Random Forest feature selection (8 features)
# FIX: use rfr_em_mif_8_labels — the Random Forest emissions selection
# referenced everywhere else; 'rfr_em_mif_n_labels' looked like a leftover
# placeholder name
kernel_svr_em_reduced_data_8f, train_and_test_data_reduced_8f = run_training_cycle(rfr_em_mif_8_labels,
                                                                                   kernel_svr_em_params,
                                                                                   kernel_svr_em_model,
                                                                                   v=False)
# Standardized training features restricted to the 8 selected columns
x_train_std_em_8f = train_and_test_data_reduced_8f['train'][0]
# Rebind params as plain kwargs (not a grid) for direct SVR construction below
kernel_svr_em_params = {'kernel': 'rbf',
                        'C': 50,
                        'gamma': 0.001,
                        'epsilon': 0.1}
# Validation curve over C to check sensitivity around the retained value
plot_validation_curve(SVR(**kernel_svr_em_params),
                      x_train_std_em_8f,
                      y_train_emissions,
                      'C',
                      np.arange(1, 100, 10),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
# N.B : function from mlearn evaluator module
plot_learning_curve(SVR(**kernel_svr_em_params),
                    x_train_std_em_8f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : No
kernel_svr_em_reduced_data_8f['learning_potential'] = 'No'
%%time
# Multi-Layer Perceptron Regressor
mlpr_model = MLPRegressor(activation='relu', #'identity',
learning_rate='adaptive',
alpha=4,
max_iter=5000,
verbose=False,
random_state=42)
mlpr_en_params = {'hidden_layer_sizes': (16, 3), # pow(16, 3) < df.shape[0]
'learning_rate_init': [0.0008]} # 0.001
mlpr_en_data = train_gridsearch(energy_data,
mlpr_model,
mlpr_en_params)
mlpr_en_data['model']
%%time
# Second training cycle with features reduced
# Fresh (unfitted) estimator with the same fixed hyperparameters
mlpr_en_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=5000,
                             verbose=False,
                             random_state=42)
# Build second cycle results
mlpr_en_second_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                          mlpr_en_model,
                                                                          mlpr_en_params)
# Best performances for each feature set (notebook cell output)
mlpr_en_second_cycle_results_df
# Retrain on the Random Forest 7-feature selection (retained trade-off)
mlpr_en_reduced_data_7f, train_and_test_data_reduced_7f = run_training_cycle(rfr_en_mif_7_labels,
                                                                             mlpr_en_params,
                                                                             mlpr_en_model,
                                                                             v=False)
# Standardized training features restricted to the 7 selected columns
x_train_std_en_7f = train_and_test_data_reduced_7f['train'][0]
# Rebind params as plain kwargs (not a grid) for direct construction below
mlpr_en_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 5000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0008,
                  'random_state': 42}
# Validation curve over the initial learning rate
plot_validation_curve(MLPRegressor(**mlpr_en_params),
                      x_train_std_en_7f,
                      y_train_energy,
                      'learning_rate_init',
                      np.arange(0.0001, 0.001, 0.0001),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# 0.0002 seems to be a better learning rate
# Run another training cycle with a lower learning rate
# NOTE(review): the grid below uses 0.0001, not the 0.0002 suggested by the
# validation curve above — confirm which value was intended
mlpr_en_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=10000,
                             verbose=False,
                             random_state=42)
# FIX: hidden_layer_sizes candidates must be wrapped in a list — a bare
# tuple (16, 3) would be read by GridSearchCV as two candidates (16 and 3)
mlpr_en_params = {'hidden_layer_sizes': [(16, 3)],
                  'learning_rate_init': [0.0001]}
mlpr_en_reduced_data_7f, train_and_test_data_reduced_7f = run_training_cycle(rfr_en_mif_7_labels,
                                                                             mlpr_en_params,
                                                                             mlpr_en_model,
                                                                             v=True)
# Plot learning curve
x_train_std_en_7f = train_and_test_data_reduced_7f['train'][0]
# Plain kwargs (not a grid) for direct MLPRegressor construction
mlpr_en_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 50000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0001,
                  'random_state': 42}
# N.B : function from mlearn evaluator module
plot_learning_curve(MLPRegressor(**mlpr_en_params),
                    x_train_std_en_7f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : No
mlpr_en_reduced_data_7f['learning_potential'] = 'No'
%%time
mlpr_model = MLPRegressor(activation='relu', #'identity',
learning_rate='adaptive',
alpha=4,
max_iter=5000,
verbose=False,
random_state=42)
mlpr_em_params = {'hidden_layer_sizes': (16, 3), # pow(16, 3) < df.shape[0]
'learning_rate_init': [0.00075]} # 0.001
mlpr_em_data = train_gridsearch(emissions_data,
mlpr_model,
mlpr_em_params)
mlpr_em_data['model']
%%time
# Second training cycle with features reduced
mlpr_em_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=5000,
                             verbose=False,
                             random_state=42)
# Build second cycle results
mlpr_em_second_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                          mlpr_em_model,
                                                                          mlpr_em_params)
# Best performances for each feature set (notebook cell output)
mlpr_em_second_cycle_results_df
# Best performances came from en_best_features (energy selected features union)
# NOTE(review): despite the comment above, the retrain below uses the
# Random Forest 8-feature emissions selection — confirm which set was intended
mlpr_em_reduced_data_8f, train_and_test_data_reduced_8f = run_training_cycle(rfr_em_mif_8_labels,
                                                                             mlpr_em_params,
                                                                             mlpr_em_model,
                                                                             v=False)
# Standardized training features restricted to the 8 selected columns
x_train_std_em_8f = train_and_test_data_reduced_8f['train'][0]
# Rebind params as plain kwargs (not a grid) for direct construction below
mlpr_em_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 50000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.00075,
                  'random_state': 42}
# Validation curve over the initial learning rate
plot_validation_curve(MLPRegressor(**mlpr_em_params),
                      x_train_std_em_8f,
                      y_train_emissions,
                      'learning_rate_init',
                      np.arange(0.0001, 0.001, 0.0001),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# 0.0003 seems to be a better value for learning rate
mlpr_em_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=5000,
                             verbose=False,
                             random_state=42)
# FIX: hidden_layer_sizes candidates must be wrapped in a list — a bare
# tuple (16, 3) would be read by GridSearchCV as two candidates (16 and 3)
mlpr_em_params = {'hidden_layer_sizes': [(16, 3)],
                  'learning_rate_init': [0.0003]}
# NOTE(review): this retrain uses the ElasticNet selection (28 features) and
# binds *_28f variables, yet the learning curve and the flag below reuse the
# 8-feature data/dict from the earlier run — confirm this is intentional
mlpr_em_reduced_data_28f, train_and_test_data_reduced_28f = run_training_cycle(enet_em_mif_n_labels,
                                                                               mlpr_em_params,
                                                                               mlpr_em_model,
                                                                               v=True)
# Plot learning curve
x_train_std_em_8f = train_and_test_data_reduced_8f['train'][0]
# Plain kwargs (not a grid) for direct MLPRegressor construction
mlpr_em_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 50000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0003,
                  'random_state': 42}
# N.B : function from mlearn evaluator module
plot_learning_curve(MLPRegressor(**mlpr_em_params),
                    x_train_std_em_8f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
mlpr_em_reduced_data_8f['learning_potential'] = 'Yes'
%%time
# Third training cycle with features reduced
rfr_en_model = RandomForestRegressor(random_state=42)
rfr_en_params = {'n_estimators': [100],
                 'min_samples_leaf': [1]}
rfr_en_third_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                        rfr_en_model,
                                                                        rfr_en_params)
# Best performances for each feature set (notebook cell output)
rfr_en_third_cycle_results_df
%%time
# Third training cycle with features reduced
rfr_em_model = RandomForestRegressor(random_state=42)
rfr_em_params = {'n_estimators': [200],
                 'min_samples_leaf': [1]}
rfr_em_third_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                        rfr_em_model,
                                                                        rfr_em_params)
# Best performances for each feature set (notebook cell output)
rfr_em_third_cycle_results_df
%%time
# Third training cycle with features reduced
xgb_reg_en_model = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_en_params = {'n_estimators': [500],
                     'alpha': [0.5],
                     'learning_rate': [0.062],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }
xgb_reg_en_third_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                            xgb_reg_en_model,
                                                                            xgb_reg_en_params)
# Best performances for each feature set (notebook cell output)
xgb_reg_en_third_cycle_results_df
# Fresh estimator, retrained on the Random Forest 7-feature selection
xgb_reg_en_model = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_en_reduced_data_7f, train_and_test_data_reduced_7f = run_training_cycle(rfr_en_mif_7_labels,
                                                                                xgb_reg_en_params,
                                                                                xgb_reg_en_model,
                                                                                v=False)
# Plot training curve
x_train_std_en_7f = train_and_test_data_reduced_7f['train'][0]
# Rebind params as plain kwargs (not a grid) for direct construction below
xgb_reg_en_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 0.5,
                     'learning_rate': 0.062,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}
# Validation curve over the number of boosting rounds
plot_validation_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                      x_train_std_en_7f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 800, 100),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                    x_train_std_en_7f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
xgb_reg_en_reduced_data_7f['learning_potential'] = 'Yes'
%%time
# Third training cycle with features reduced
xgb_reg_em_model = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_em_params = {'n_estimators': [500],
                     'alpha': [1.6],
                     'learning_rate': [0.05],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }
xgb_reg_em_third_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                            xgb_reg_em_model,
                                                                            xgb_reg_em_params)
# Best performances for each feature set (notebook cell output)
xgb_reg_em_third_cycle_results_df
# Fresh estimator, retrained on the Random Forest 8-feature selection
xgb_reg_em_model = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_em_reduced_data_8f, train_and_test_data_reduced_8f = run_training_cycle(rfr_em_mif_8_labels,
                                                                               xgb_reg_em_params,
                                                                               xgb_reg_em_model,
                                                                               v=False)
# Plot training curve
x_train_std_em_8f = train_and_test_data_reduced_8f['train'][0]
# Rebind params as plain kwargs (not a grid) for direct construction below
xgb_reg_em_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 1.6,
                     'learning_rate': 0.05,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}
# Validation curve over the number of boosting rounds
plot_validation_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                      x_train_std_em_8f,
                      y_train_emissions,
                      'n_estimators',
                      np.arange(200, 800, 100),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                    x_train_std_em_8f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
xgb_reg_em_reduced_data_8f['learning_potential'] = 'Yes'
# Best models for energy target (model data dictionaries from train_gridsearch wrapper)
best_en_models = [enet_en_reduced_data_29f,
                  rfr_en_reduced_data_7f,
                  xgb_reg_en_reduced_data_7f,
                  kernel_svr_en_reduced_data_7f,
                  mlpr_en_reduced_data_7f]
# Best models for emissions target (model data dictionaries from train_gridsearch wrapper)
best_em_models = [enet_em_reduced_data_28f,
                  rfr_em_reduced_data_8f,
                  xgb_reg_em_reduced_data_8f,
                  kernel_svr_em_reduced_data_8f,
                  mlpr_em_reduced_data_8f]
# Build main evaluation variables list from a random model data dictionary
# NOTE: relies on dict insertion order; the [2:] slice skips the first two
# entries of the model-data dict (presumably the estimator and its params —
# verify against train_gridsearch)
model_data_keys = dict(list(rfr_en_reduced_data_7f.items())[2:])
# --> ['model_name', 'rmse', 'r2', 'time', 'n_features', 'learning_potential']
# Build main evaluation variables for result dataframes
training_results_keys = ['Model', 'RMSE', 'R2', 'Run time', 'Selected features', 'Learning potential']
# Build result dictionaries for each target
energy_training_results = {k: [] for k in training_results_keys}
emissions_training_results = {k: [] for k in training_results_keys}
# Fill result dictionaries with model data for each target
# (iterating a dict yields its keys, so k2 walks the model-data key names)
for en_model, em_model in zip(best_en_models, best_em_models):
    for k1, k2 in zip(training_results_keys, model_data_keys):
        energy_training_results[k1].append(en_model[k2])
        emissions_training_results[k1].append(em_model[k2])
target_dicts = [energy_training_results, emissions_training_results]
# Build result dataframes for each target
en_results_df, em_results_df = [pd.DataFrame(target_dict) for target_dict in target_dicts]
# Display energy results (notebook cell output)
en_results_df
# Best model XGBRegressor
em_results_df
# Best model XGBRegressor
# Energy target ('SiteEnergyUse(kBtu)') results
en_results_data = {'en_results_df': en_results_df,
                   'en_best_model': xgb_reg_en_reduced_data_7f,
                   'en_best_features': rfr_en_mif_7_labels}
# Emissions target ('TotalGHGEmissions') results
em_results_data = {'em_results_df': em_results_df,
                   'em_best_model': xgb_reg_em_reduced_data_8f,
                   'em_best_features': rfr_em_mif_8_labels}
# Merge target results into another dictionary
results_data = {'en': en_results_data, 'em': em_results_data}
# Save it as a .pkl file
# N.B : pickle_data comes from a project module; 'w' presumably selects write
# mode — verify against the helper's definition
pickle_data(filename='main_results_ENERGYSTARScore',
            folder='../data/pkl',
            data=results_data,
            method='w')
# Check if the selected features are identical from one target to another
rfr_en_mif_7_labels == rfr_em_mif_8_labels
# The selected features are different from one target to another
rfr_en_mif_7_labels
# Plot feature importance from best model
xgb.plot_importance(xgb_reg_en_reduced_data_7f['model'])
# Total number of variables required : 7
rfr_em_mif_8_labels
xgb.plot_importance(xgb_reg_em_reduced_data_8f['model'])
# Total number of variables required : 8